# -*- coding: utf-8 -*-
import scrapy
from bookScrapy.items import BookscrapyItem

class BookscrapySpider(scrapy.Spider):
    """Crawl Douban book listings under the '金融' (finance) tag.

    For each book on a listing page, yields a ``BookscrapyItem`` with
    title, author, publishing house, publish date, price, rating score
    and comment count, then follows the "next page" link.  Crawling
    stops before the page with offset ``start=200`` (i.e. only the
    first 10 pages of 20 books each are scraped).
    """
    name = 'bookScrapy'
    allowed_domains = ['douban.com']
    start_urls = ['https://book.douban.com/tag/金融']

    def parse(self, response):
        """Parse one listing page: yield one item per book, then
        schedule the next page (if any, and before the start=200 cutoff).

        Robustness fixes over the naive version: ``extract_first()`` may
        return ``None`` and the slash-separated "pub" line may have fewer
        fields than expected — both previously crashed the spider.
        """
        books = response.xpath('//li[@class="subject-item"]/div[@class="info"]')
        for one_book in books:
            item = BookscrapyItem()
            item['title'] = one_book.xpath(
                "normalize-space(h2[@class='']/a/text())").extract_first()

            # The "pub" line is one slash-separated string such as
            # "[美] 作者 / 译者 / 出版社 / 出版年 / 定价".  Extract it ONCE
            # (the original ran the same XPath four times) and guard
            # against None so a malformed row cannot raise AttributeError.
            pub_text = one_book.xpath(
                "normalize-space(div[@class='pub']/text())").extract_first() or ''
            fields = [part.strip() for part in pub_text.split('/')]
            # Index guards: short rows yield None instead of IndexError.
            # Indices match the original ([1], [-3], [-2], [-1]).
            item['author'] = fields[1] if len(fields) > 1 else None
            item['publishingHouse'] = fields[-3] if len(fields) >= 3 else None
            item['dateTime'] = fields[-2] if len(fields) >= 2 else None
            item['price'] = fields[-1] if fields[-1] else None

            item['score'] = one_book.xpath(
                "div[@class='star clearfix']/span[@class='rating_nums']/text()"
            ).extract_first()
            item['commentsNumber'] = one_book.xpath(
                "normalize-space(div[@class='star clearfix']/span[@class='pl']/text())"
            ).extract_first()
            yield item

        # Link to the next listing page.
        next_url = response.xpath("//span[@class='next']/link/@href").extract_first()
        # On the last page there is no next link, so next_url is None —
        # stop cleanly instead of crashing on None.find() (original bug).
        # Also stop at start=200 to cap the crawl at the first 10 pages.
        if not next_url or 'start=200' in next_url:
            return
        # Recurse via yield to crawl the following page.
        yield scrapy.Request("https://book.douban.com" + next_url,
                             callback=self.parse, dont_filter=True)